rm(list = ls())
gc(verbose = FALSE)

args = commandArgs(trailingOnly = "TRUE")
args

arg1 <- args[1]
arg2 <- args[2]

# Define library folder
setwd(arg1)
.libPaths(arg2)
.libPaths(.libPaths()[1])

pkg <- c("nnls", "SuperLearner", "dplyr", "arm", "onehot", "grf", "xgboost", "caret", "data.table",
          "knitr", "kableExtra", "magrittr", "kimisc", "data.table", "readstata13", "DescTools", "scales", "ggplot2")
lapply(pkg, library, character.only = T, lib.loc = arg2)


#####################################################################
#read the anonymized data
ihwap_merged = read.dta13(paste0(arg1, "/Data/ihwap_anonym.dta"))


####################################### data for predicting pre-treatment (counterfactual) usage
# drop some variables not to be used
ihwap_full = subset(ihwap_merged, select = c(Household, treated, total_mmbtu, HDD60, HDD65, CDD75,
                                           ExistingConsumption, ProjectedConsumption, sqfeet, noccupants, nwindows,
                                           nstories, builddate, CountyID, ShieldingClass, Blower_Pre, tbesetting,
                                           MainHeatFuel, HeatTypeMain, MainHeatBTU, Attic_RValue, Real_income,
                                           white, black, hispanic, asian, nativeamerican, otherrace, renter, female,
                                           Disabled, haselderly, hasminor, nbedrooms, audit_day, audit_month, audit_year,
                                           start_day, start_month, start_year, end_day, end_month, end_year, meterdays,
                                           priority, ProgramYear, tmin, tmax, precip, elec_prices, gas_prices))
ihwap_full = subset(ihwap_full, select = -c(audit_day, audit_month, audit_year, start_day, start_month, start_year))

# keeping only homes that heat with Gas
ihwap_full = ihwap_full[which(ihwap_full$MainHeatFuel=="1 - Natural Gas"), ]
ihwap_full = subset(ihwap_full, select = -c(MainHeatFuel))

# preprocessing the control variables
ihwap_full$MainHeatBTU_wins = Winsorize(ihwap_full$MainHeatBTU, na.rm=TRUE)
ihwap_full = subset(ihwap_full, select = -c(MainHeatBTU))

ihwap_full$hasattic = ifelse(ihwap_full$Attic_RValue>0, 1, 0)
ihwap_full$hasattic = ifelse(is.na(ihwap_full$hasattic)==TRUE, 0, ihwap_full$hasattic)
ihwap_full$hasattic = as.numeric(ihwap_full$hasattic)

categorical_vars = which(colnames(ihwap_full) %in% 
                           c("buildingdate", "noccupants", "ShieldingClass", "CountyID",
                             "nstories",  "Disabled", "HeatTypeMain", "tbesetting"))

ihwap_full[, categorical_vars] = lapply(ihwap_full[, categorical_vars], as.factor)
ihwap_full[, categorical_vars] = lapply(ihwap_full[, categorical_vars], as.numeric)

# droppping non-processed variables that have too many NAs, or are not relevant for ML
ihwap_full = subset(ihwap_full, select = -c(Attic_RValue))

# convert every variable to numeric
ihwap_full[,c(1:length(ihwap_full))] = lapply(ihwap_full[,c(1:length(ihwap_full))], as.numeric)

# keep only complete cases (dropping NAs from all variables)
ihwap_full = ihwap_full[complete.cases(ihwap_full), ]

# transform to data.table to facilitate some operations
ihwap_full = data.table(ihwap_full)

# defining folds for cross-validation
homes = unique(ihwap_full[, c("Household")])
set.seed(1234)
homes[, rand:=runif(.N, 0, 1)]
homes$kfold = ifelse(homes$rand<0.2,2,1)
homes$kfold = ifelse(homes$rand>=0.2&homes$rand<0.4,3,homes$kfold)
homes$kfold = ifelse(homes$rand>=0.4&homes$rand<0.6,4,homes$kfold)
homes$kfold = ifelse(homes$rand>=0.6&homes$rand<0.8,5,homes$kfold)

ihwap_full = merge(ihwap_full, homes[, c("Household", "kfold")], by="Household")

ihwap_full[, rand:=runif(.N, 0, 1)]
ihwap_full$kfold2 = ifelse(ihwap_full$rand<0.2,2,1)
ihwap_full$kfold2 = ifelse(ihwap_full$rand>=0.2&ihwap_full$rand<0.4,3,ihwap_full$kfold2)
ihwap_full$kfold2 = ifelse(ihwap_full$rand>=0.4&ihwap_full$rand<0.6,4,ihwap_full$kfold2)
ihwap_full$kfold2 = ifelse(ihwap_full$rand>=0.6&ihwap_full$rand<0.8,5,ihwap_full$kfold2)
ihwap_full = subset(ihwap_full, select = -c(rand))

setorder(ihwap_full, Household, end_year, end_month, end_day)
ihwap_full$rowid = seq.int(nrow(ihwap_full))

write.csv(ihwap_full, paste0(arg1, "/Data/ihwap_full.csv"), row.names = FALSE)
